Clickx 47

home *** CD-ROM | disk | FTP | other *** search

/ Clickx 47 / Clickx 47.iso / assets / software / Miro_Installer.exe / Miro_Downloader.exe / rdfa.pyc (.txt) < prev next >

Wrap

Python Compiled Bytecode | 2008-01-10 | 7.8 KB | 297 lines

# Source Generated with Decompyle++
# File: in.pyc (Python 2.5)
'''
RDFa parser.

RDFa is a set of attributes used to embed RDF in XHTML. An important goal of
RDFa is to achieve this RDF embedding without repeating existing XHTML content
when that content is the metadata.

REFERENCES:

\thttp://www.w3.org/2001/sw/BestPractices/HTML/2005-rdfa-syntax

Copyright (c) 2006, Elias Torres <elias@torrez.us>
Licensed to the public under the GNU GPL v2.
'''

import sys
import re
import urllib
from xml.dom import pulldom

try:
    # Python 2 module layout, which is what this file was written against.
    import urlparse
    import cStringIO
except ImportError:
    # Python 3: the same functionality lives in relocated modules.
    import io
    import urllib.parse as urlparse
    import urllib.request

    _text = str
    _urlopen = urllib.request.urlopen

    def _as_stream(data):
        """Wrap *data* (bytes or text) in a file-like object for pulldom."""
        if isinstance(data, bytes):
            return io.BytesIO(data)
        return io.StringIO(data)
else:
    _text = unicode
    _urlopen = urllib.urlopen

    def _as_stream(data):
        """Wrap *data* in a file-like object for pulldom."""
        return cStringIO.StringIO(data)

__version__ = '$Id: rdfa.py 118 2006-06-03 18:35:18Z eliast $'

# Attributes whose presence on an element makes the parser look at it.
rdfa_attribs = ['about', 'property', 'rel', 'rev', 'href', 'content']


class NS(_text):
    """Namespace URI string; ``ns.name`` evaluates to ``ns + 'name'``."""

    def __getattr__(self, name):
        # Only reached for attributes the string type does not define.
        return self + name


xhtml = NS('http://www.w3.org/1999/xhtml')
xml = NS('http://www.w3.org/XML/1998/namespace')
rdf = NS('http://www.w3.org/1999/02/22-rdf-syntax-ns#')


class Node(_text):
    """Base class for the string-valued terms handed to a sink."""


class URI(Node):
    """A URI reference term."""


class bNode(Node):
    """A blank node term (a string of the form ``_:name``)."""


class Literal(Node):
    """A literal term, stored in its quoted serialized form."""

    def __new__(cls, lit, lang=None, dtype=None):
        """Build ``"lit"``, ``"lit"@lang`` or ``"lit"^^<dtype>``.

        A language tag takes precedence: when *lang* is given, *dtype* is
        ignored.
        """
        n = '"' + lit + '"'
        if lang is not None:
            n += '@' + str(lang)
        elif dtype is not None:
            n += '^^<' + str(dtype) + '>'
        return _text.__new__(cls, n)


_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')


def _urljoin(base, uri):
    """Join *uri* onto *base* after dropping extra slashes after ``scheme://``."""
    uri = _urifixer.sub('\\1\\3', uri)
    return urlparse.urljoin(base, uri)


class RDFaParser:
    """Pull-parses an XML stream and reports RDFa triples to a sink.

    The sink only needs a ``triple(subject, predicate, object)`` method.
    """

    def __init__(self, sink, base=None, lang=None):
        """Store the sink callback and the initial base URI and language."""
        self.triple = sink.triple
        self.baseuri = base or ''
        self.lang = lang or None
        # (subject, element) pairs for the 'about' attributes in scope.
        self.abouts = []
        # Effective base URI / language of every open element.
        self.xmlbases = []
        self.langs = []
        # Open elements; the None sentinel lets the root element look up a
        # (missing) parent through elementStack[-2].
        self.elementStack = [None]
        # Per-tag-name counters and per-element cache for blank nodes.
        self.bcounter = {}
        self.bnodes = {}

    def generateBlankNode(self, parentNode):
        """Return the blank node for *parentNode*, creating it on first use.

        Blank nodes are named ``_:<tagName><n>`` with *n* counting from 0 per
        tag name; the same element always gets the same blank node.
        """
        if parentNode in self.bnodes:
            return self.bnodes[parentNode]
        name = parentNode.tagName
        if name in self.bcounter:
            self.bcounter[name] += 1
        else:
            self.bcounter[name] = 0
        self.bnodes[parentNode] = bNode('_:%s%d' % (name, self.bcounter[name]))
        return self.bnodes[parentNode]

    def extractCURIEorURI(self, resource):
        """Turn an attribute value into a ``URI`` or ``bNode``.

        Surrounding square brackets are stripped, a ``prefix:suffix`` value
        whose prefix is known to the pulldom handler is expanded, values
        starting with ``_:`` become blank nodes and everything else is
        resolved against the current base URI.
        """
        if len(resource) > 0 and resource[0] == '[' and resource[-1] == ']':
            resource = resource[1:-1]
        if resource.find(':') > -1:
            rpre, rsuf = resource.split(':', 1)
            # Every context is scanned, so the last matching binding wins.
            for nsc in self.handler._ns_contexts:
                for ns, prefix in nsc.items():
                    if prefix == rpre:
                        resource = ns + rsuf
        if len(resource) > 0 and resource[0:2] == '_:':
            return bNode(resource)
        return URI(self.resolveURI(resource))

    def resolveURI(self, uri):
        """Resolve *uri* against the current base URI (if any)."""
        return _urljoin(self.baseuri or '', uri)

    def _popStacks(self, event, node):
        """Leave the scope of *node*: unwind the about/element/base/lang stacks."""
        if len(self.abouts) != 0:
            about, aboutnode = self.abouts[-1]
            if aboutnode == node:
                self.abouts.pop()
        self.elementStack.pop()
        # Each stack entry is the *effective* value for its element, so the
        # new top is exactly what the parent had.  Restore it even when it is
        # empty/None; otherwise a child's xml:base or xml:lang would leak
        # onto its following siblings.
        if self.xmlbases:
            self.xmlbases.pop()
            if self.xmlbases:
                self.baseuri = self.xmlbases[-1]
        if self.langs:
            self.langs.pop()
            if self.langs:
                self.lang = self.langs[-1]

    def _parentSubject(self, parentNode):
        """Subject for a meta/link element that carries no 'about' itself."""
        if parentNode.hasAttribute('about'):
            return self.extractCURIEorURI(parentNode.getAttribute('about'))
        if parentNode.hasAttributeNS(xml, 'id') or parentNode.hasAttribute('id'):
            ident = (parentNode.getAttributeNS(xml, 'id')
                     or parentNode.getAttribute('id'))
            return self.extractCURIEorURI('#' + ident)
        return self.generateBlankNode(parentNode)

    def _startElement(self, events, event, node):
        """Handle one START_ELEMENT event: track scope and emit its triples."""
        self.elementStack.append(node)
        present = node.attributes.keys()
        found = [name for name in rdfa_attribs if name in present]

        # Track xml:base (or plain 'base') coming into scope.
        baseuri = (node.getAttributeNS(xml, 'base')
                   or node.getAttribute('base')
                   or self.baseuri)
        self.baseuri = _urljoin(self.baseuri, baseuri)
        self.xmlbases.append(self.baseuri)

        # Track xml:lang (or plain 'lang'); an empty value means "no language".
        if node.hasAttributeNS(xml, 'lang') or node.hasAttribute('lang'):
            lang = (node.getAttributeNS(xml, 'lang')
                    or node.getAttribute('lang')
                    or None)
        else:
            lang = self.lang
        self.lang = lang
        self.langs.append(lang)

        if not found:
            return

        parentNode = self.elementStack[-2]
        if 'about' in found:
            self.abouts.append(
                (self.extractCURIEorURI(node.getAttribute('about')), node))
        subject = self.abouts[-1][0]
        if (node.tagName == 'meta' or node.tagName == 'link') \
                and 'about' not in found and parentNode:
            subject = self._parentSubject(parentNode)

        if 'property' in found:
            predicate = self.extractCURIEorURI(node.getAttribute('property'))
            datatype = None
            if node.hasAttribute('datatype'):
                datatype = self.extractCURIEorURI(node.getAttribute('datatype'))
                if datatype == 'plaintext':
                    datatype = None
            if node.hasAttribute('content'):
                literal = Literal(node.getAttribute('content'),
                                  lang=lang, dtype=datatype)
            else:
                # expandNode() consumes every event up to and including this
                # element's END_ELEMENT, so its scope is unwound here.
                events.expandNode(node)
                self._popStacks(event, node)
                content = ''.join(
                    [child.toxml() for child in node.childNodes]).strip()
                literal = Literal(content, dtype=rdf.XMLLiteral)
            self.triple(subject, predicate, literal)

        if 'rel' in found:
            predicate = self.extractCURIEorURI(node.getAttribute('rel'))
            if node.hasAttribute('href'):
                target = self.extractCURIEorURI(node.getAttribute('href'))
                self.triple(subject, predicate, target)

        if 'rev' in found:
            predicate = self.extractCURIEorURI(node.getAttribute('rev'))
            if node.hasAttribute('href'):
                target = self.extractCURIEorURI(node.getAttribute('href'))
                self.triple(target, predicate, subject)

    def parse(self, stream):
        """Parse the XML in file-like *stream*, calling the sink per triple.

        Raises whatever ``xml.dom.pulldom`` raises for malformed XML.
        """
        events = pulldom.parse(stream)
        # Kept for extractCURIEorURI, which reads the namespace contexts.
        self.handler = events.pulldom
        for event, node in events:
            if event == pulldom.START_DOCUMENT:
                # The document itself is the default subject: <>.
                self.abouts.append((URI(''), node))
            elif event == pulldom.END_DOCUMENT:
                # Only the None sentinel may remain once all elements closed.
                assert len(self.elementStack) == 1
            elif event == pulldom.START_ELEMENT:
                self._startElement(events, event, node)
            elif event == pulldom.END_ELEMENT:
                self._popStacks(event, node)


class Sink(object):
    """Default sink: accumulates triples as N-Triples-style text lines."""

    def __init__(self):
        self.result = ''

    def __str__(self):
        return self.result

    def triple(self, s, p, o):
        """Append one ``s <p> o .`` line; URI subjects/objects get brackets."""
        if o.__class__ is URI:
            o = '<' + o + '>'
        if s.__class__ is URI:
            s = '<' + s + '>'
        self.result += '%s <%s> %s .\n' % (s, p, o)


def parseRDFa(s, base=None, sink=None):
    """Parse the document text *s* and return the sink holding its triples.

    *base* is the initial base URI.  When *sink* is omitted (or falsy) a
    fresh ``Sink`` is created; otherwise the given sink is used and returned.
    """
    if not sink:
        sink = Sink()
    parser = RDFaParser(sink, base)
    parser.parse(_as_stream(s))
    return sink


def parseURI(uri, sink=None):
    """Fetch *uri* and parse it with ``parseRDFa``, using *uri* as the base."""
    response = _urlopen(uri)
    try:
        data = response.read()
    finally:
        response.close()
    return parseRDFa(data, base=uri, sink=sink)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print(__doc__)
    else:
        print(parseURI(sys.argv[1]))